library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)
library(tis)

options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")

# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'

sg_hourly_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/weekly-patterns/v2/hourly/'

Load SF and Alameda case data and Alameda visits data.

# block groups
sf_blockgroups <-
  block_groups("CA","San Francisco", cb=F, progress_bar=F) %>%
  st_transform('+proj=longlat +datum=WGS84')

ac_blockgroups <-
  block_groups("CA","Alameda", cb=F, progress_bar=F) %>%
  st_transform('+proj=longlat +datum=WGS84')

sf_bgs <- sf_blockgroups %>% pull(GEOID)

ac_bgs <- ac_blockgroups %>% pull(GEOID)

# zip code areas
zctas_94 <- 
  zctas(cb = F, starts_with = "94") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_95 <- 
  zctas(cb = F, starts_with = "95") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_94_95 <- rbind(zctas_94, zctas_95)

# join with block groups
sf_bg_zctas <- sf_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

ac_bg_zctas <- ac_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)


# get SF case data
sf_place_cases <- 
  read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
  filter(county == 'San Francisco')

# get population data for San Francisco
acs_vars = readRDS("/Users/simonespeizer/CEE 218Z/censusData2018_acs_acs5.rds")

# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
    regionString <- paste0("state:06+county:", county)
    censusData <- getCensus(
      name = "acs/acs5",
      vintage = 2018,
      region = "block group:*", 
      regionin = regionString,
      vars = variableToPull
    ) %>%
    mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
      select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
  
  return(censusData)
}

sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)

# getting population by zip code
sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>% 
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

# join with cases and calculate cases per person
sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
  mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
  rename(zipcode = place)

# get Alameda County case data - downloaded manually
ac_place_cases <- read.csv("Simone_Speizer/Alameda_County_Cumulative_Cases_By_Place_And_Zip.csv")

# handle the ones that are reported as <10 by calling them 5 cases. Note this is a simplification/choice I made and could be changed.
ac_cases_zip <- ac_place_cases %>% 
  rename(date = DtCreate) %>%
  mutate(date = date %>%  substr(1,10) %>% as.Date()) %>%
  dplyr::select(c(date, contains("F9"))) %>% # only select zip code data
  gather(key = "zipcode", value = "cases", -date) %>%
  mutate(cases = ifelse(cases == "<10", "5", cases), 
         zipcode = zipcode %>% substr(2,6)) %>% # replace cases <10 with 10 and remove the "F" from zipcode names
  mutate(cases = as.numeric(cases)) %>%
  arrange(date)

# get Alameda County populations by zip code
ac_fips <- fips("CA", "Alameda") %>% substr(3,5)

# calculate population by zip code
ac_pop_zip <- pullCensus("B01003_001E", ac_fips) %>% 
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

# join with cases
ac_cases_zip <- ac_cases_zip %>% left_join(ac_pop_zip) %>%
  mutate(cases_by_pop = cases / total_pop_zip)

ac_daily_visits_zip_1 <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))

ac_daily_visits_zip_2 <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_05-25-20_05-31-20.rds"))

ac_daily_visits_zip <- rbind(ac_daily_visits_zip_1, ac_daily_visits_zip_2)

# calculate cumulative visits over time
ac_cum_visits_zip <- ac_daily_visits_zip %>% 
  mutate(total_visits_avg = replace_na(total_visits_avg, 0)) %>% # replace NAs with zero so they don't lead to NAs all following that date
  group_by(zipcode) %>%
  mutate(cumulative_visits = cumsum(total_visits_avg)) %>%
  left_join(ac_pop_zip) %>%
  mutate(cum_visits_per_capita = cumulative_visits / total_pop_zip)

Slopes of cumulative curves

Testing out using past data to predict future data, for a particular zip code. I use 94606.

zip_select <- "94606"

ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select)

ac_cum_visits_zip_select <- ac_cum_visits_zip %>% filter(zipcode == zip_select)

ac_cum_visits_zip_select %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
ac_cases_zip_select %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative cases per person'))

For this zip code, it looks like there is a change in case growth rate after May 1st relative to before May 1st. I will assume a lag of 14 days on the visits data, and look at the slope of the cumulative visits data from April 3rd to April 16th, and April 17th through May 1st.

change_date <- as.Date("2020-05-01")
lag <- 14
date_range_width <- 14
start_date <- change_date - lag - date_range_width
break_date <- change_date - lag
end_date <- change_date - lag + date_range_width
ac_cum_visits_zip_select_1 <- ac_cum_visits_zip_select %>% filter(date < break_date & date >= start_date)
ac_cum_visits_zip_select_2 <- ac_cum_visits_zip_select %>% filter(date >= break_date & date <= end_date)

plot_ly() %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
model_1 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_1)
summary(model_1)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.186179 -0.043693 -0.008393  0.070765  0.184887 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.088e+04  1.192e+02  -91.23   <2e-16 ***
## date         5.937e-01  6.494e-03   91.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09795 on 12 degrees of freedom
## Multiple R-squared:  0.9986, Adjusted R-squared:  0.9984 
## F-statistic:  8359 on 1 and 12 DF,  p-value: < 2.2e-16
model_2 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_2)
summary(model_2)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.074767 -0.030712 -0.002396  0.019025  0.099361 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.225e+04  4.979e+01  -246.0   <2e-16 ***
## date         6.685e-01  2.710e-03   246.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04534 on 13 degrees of freedom
## Multiple R-squared:  0.9998, Adjusted R-squared:  0.9998 
## F-statistic: 6.087e+04 on 1 and 13 DF,  p-value: < 2.2e-16
plot_ly() %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = fitted(model_1), mode = 'lines', name = "model 1", text = paste("slope:", summary.lm(model_1)$coef[2])) %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = fitted(model_2), mode = 'lines', name = "model 2", text = paste("slope:", summary.lm(model_2)$coef[2])) %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))

There is a difference, not huge, but it does exist.

Try this again but using more data (3 week range instead of 2)

change_date <- as.Date("2020-05-01")
lag <- 14
date_range_width <- 21
start_date <- change_date - lag - date_range_width
break_date <- change_date - lag
end_date <- change_date - lag + date_range_width
ac_cum_visits_zip_select_1 <- ac_cum_visits_zip_select %>% filter(date < break_date & date >= start_date)
ac_cum_visits_zip_select_2 <- ac_cum_visits_zip_select %>% filter(date >= break_date & date <= end_date)

plot_ly() %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
model_1 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_1)
summary(model_1)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24410 -0.11527  0.02113  0.07267  0.30255 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.140e+04  1.055e+02  -108.1   <2e-16 ***
## date         6.224e-01  5.748e-03   108.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1595 on 19 degrees of freedom
## Multiple R-squared:  0.9984, Adjusted R-squared:  0.9983 
## F-statistic: 1.172e+04 on 1 and 19 DF,  p-value: < 2.2e-16
model_2 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_2)
summary(model_2)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06848 -0.03712 -0.01818  0.02374  0.15295 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.217e+04  3.563e+01  -341.6   <2e-16 ***
## date         6.642e-01  1.939e-03   342.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05769 on 20 degrees of freedom
## Multiple R-squared:  0.9998, Adjusted R-squared:  0.9998 
## F-statistic: 1.174e+05 on 1 and 20 DF,  p-value: < 2.2e-16
plot_ly() %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = fitted(model_1), mode = 'lines', name = "model 1", text = paste("slope:", summary.lm(model_1)$coef[2])) %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = fitted(model_2), mode = 'lines', name = "model 2", text = paste("slope:", summary.lm(model_2)$coef[2])) %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))

Try with a different zip code.

# make this into a function
zipSelectFindSlopes <- function(zip_select, lag, change_date, date_range_width, ac_cum_visits_zip_select) {
start_date <- change_date - lag - date_range_width
break_date <- change_date - lag
end_date <- change_date - lag + date_range_width
ac_cum_visits_zip_select_1 <- ac_cum_visits_zip_select %>% filter(date < break_date & date >= start_date)
ac_cum_visits_zip_select_2 <- ac_cum_visits_zip_select %>% filter(date >= break_date & date <= end_date)

plot_ly() %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))

model_1 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_1)
print(summary(model_1))

model_2 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_2)
print(summary(model_2))

plot_ly() %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
  add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = fitted(model_1), mode = 'lines', name = "model 1", text = paste("slope:", summary.lm(model_1)$coef[2])) %>%
  add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = fitted(model_2), mode = 'lines', name = "model 2", text = paste("slope:", summary.lm(model_2)$coef[2])) %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
}

# select a new zip code
zip_select <- "94603"

ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select)

ac_cum_visits_zip_select <- ac_cum_visits_zip %>% filter(zipcode == zip_select)

ac_cum_visits_zip_select %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
ac_cases_zip_select %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative cases per person'))

This zip code seems to have a change in slope around May 10th. Again using 14 day lag.

change_date <- as.Date("2020-05-10")
lag <- 14
date_range_width <- 14
zipSelectFindSlopes(zip_select, lag, change_date, date_range_width, ac_cum_visits_zip_select)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.11836 -0.07764 -0.02610  0.04741  0.19391 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.307e+04  1.263e+02  -103.5   <2e-16 ***
## date         7.132e-01  6.873e-03   103.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1037 on 12 degrees of freedom
## Multiple R-squared:  0.9989, Adjusted R-squared:  0.9988 
## F-statistic: 1.077e+04 on 1 and 12 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.128067 -0.034061  0.001394  0.034022  0.165544 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.195e+04  9.231e+01  -129.5   <2e-16 ***
## date         6.523e-01  5.021e-03   129.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08401 on 13 degrees of freedom
## Multiple R-squared:  0.9992, Adjusted R-squared:  0.9992 
## F-statistic: 1.688e+04 on 1 and 13 DF,  p-value: < 2.2e-16

Hmm, it’s the reverse of what we would expect for that one. Maybe a larger lag is necessary?

change_date <- as.Date("2020-05-10")
lag <- 20
date_range_width <- 14
zipSelectFindSlopes(zip_select, lag, change_date, date_range_width, ac_cum_visits_zip_select)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25383 -0.05346  0.04665  0.09272  0.17186 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.272e+04  1.702e+02  -74.77   <2e-16 ***
## date         6.943e-01  9.266e-03   74.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1398 on 12 degrees of freedom
## Multiple R-squared:  0.9979, Adjusted R-squared:  0.9977 
## F-statistic:  5614 on 1 and 12 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12053 -0.02938 -0.01381  0.03765  0.09339 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.241e+04  6.546e+01  -189.5   <2e-16 ***
## date         6.770e-01  3.562e-03   190.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0596 on 13 degrees of freedom
## Multiple R-squared:  0.9996, Adjusted R-squared:  0.9996 
## F-statistic: 3.614e+04 on 1 and 13 DF,  p-value: < 2.2e-16

Not quite right for this one still.

Try 94601.

zip_select <- "94560"

ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select)

ac_cum_visits_zip_select <- ac_cum_visits_zip %>% filter(zipcode == zip_select)

ac_cum_visits_zip_select %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
ac_cases_zip_select %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative cases per person'))

This one looks like it has picked up as of May 20th.

change_date <- as.Date("2020-05-20")
lag <- 14
date_range_width <- 14
zipSelectFindSlopes(zip_select, lag, change_date, date_range_width, ac_cum_visits_zip_select)
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.05385 -0.03211 -0.01497  0.03954  0.09100 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.314e+04  5.725e+01  -229.5   <2e-16 ***
## date         7.169e-01  3.115e-03   230.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04698 on 12 degrees of freedom
## Multiple R-squared:  0.9998, Adjusted R-squared:  0.9998 
## F-statistic: 5.296e+04 on 1 and 12 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.079267 -0.040764 -0.001597  0.028882  0.114863 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.321e+04  6.555e+01  -201.5   <2e-16 ***
## date         7.208e-01  3.564e-03   202.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05963 on 13 degrees of freedom
## Multiple R-squared:  0.9997, Adjusted R-squared:  0.9997 
## F-statistic: 4.091e+04 on 1 and 13 DF,  p-value: < 2.2e-16

A slight increase after the the shift point.

Separating recent and past data

Considering a single zip code and looking at a trendline of past cases and visits, with a lag of 14 days, and seeing if this trend line fits the more recent data.

I will use zip code 94606 again.

zip_select <- "94606"
lag <- 14

ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select) %>%
  mutate(change_cases_by_pop = c(NA, diff(cases_by_pop)))

ac_daily_visits_zip_select <- ac_daily_visits_zip %>% filter(zipcode == zip_select)

ac_daily_visits_lag <- ac_daily_visits_zip_select %>% 
  ungroup() %>%
  dplyr::select(date, total_visits_avg) %>% 
  mutate(lag_date = date + lag) %>%
  dplyr::select(lag_date, total_visits_avg)

ac_visits_cases_select <- left_join(ac_cases_zip_select, ac_daily_visits_lag, by = c("date" = "lag_date")) %>%
  filter(cases >= 10) %>% # only select once cases have started to go above the threshold
  mutate(visits_per_capita = total_visits_avg / total_pop_zip)

ac_visits_cases_select %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))

Try separating this out into all but the most recent week of data, then adding that in.

ac_visits_cases_select_1 <- ac_visits_cases_select %>%
  filter(date < (max(date) - 7))

ac_visits_cases_select_2 <- ac_visits_cases_select %>%
  filter(date >= (max(date) - 7))

model_visits_cases_1 <- lm(change_cases_by_pop ~ visits_per_capita, ac_visits_cases_select_1)
summary(model_visits_cases_1)
## 
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_visits_cases_select_1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.062e-05 -4.655e-05 -1.731e-05  3.558e-05  2.329e-04 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)        5.603e-05  4.849e-05   1.156    0.253
## visits_per_capita -1.376e-05  7.315e-05  -0.188    0.851
## 
## Residual standard error: 5.417e-05 on 56 degrees of freedom
## Multiple R-squared:  0.0006317,  Adjusted R-squared:  -0.01721 
## F-statistic: 0.0354 on 1 and 56 DF,  p-value: 0.8514
plot_ly() %>%
  add_trace(data = ac_visits_cases_select_1, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_select_1, x = ~visits_per_capita, y = fitted(model_visits_cases_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_1)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_select_2, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))

Doesn’t look great. What if we remove dates on which there were zero new cases?

ac_visits_cases_select_1_no_zero <- ac_visits_cases_select_1 %>% filter(change_cases_by_pop > 0)
ac_visits_cases_select_2_no_zero <- ac_visits_cases_select_2 %>% filter(change_cases_by_pop > 0)

model_visits_cases_1_2 <- lm(change_cases_by_pop ~ visits_per_capita, ac_visits_cases_select_1_no_zero)
summary(model_visits_cases_1_2)
## 
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_visits_cases_select_1_no_zero)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.607e-05 -3.561e-05 -1.600e-05  1.240e-05  2.120e-04 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        1.157e-04  5.624e-05   2.056   0.0473 *
## visits_per_capita -6.344e-05  8.410e-05  -0.754   0.4557  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.077e-05 on 35 degrees of freedom
## Multiple R-squared:  0.016,  Adjusted R-squared:  -0.01212 
## F-statistic: 0.569 on 1 and 35 DF,  p-value: 0.4557
plot_ly() %>%
  add_trace(data = ac_visits_cases_select_1_no_zero, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_select_1_no_zero, x = ~visits_per_capita, y = fitted(model_visits_cases_1_2), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_1_2)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_select_2_no_zero, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))

Try if we look at growth rate (change divided by previous day’s levels).

ac_visits_cases_select_growth <- ac_visits_cases_select %>%
  mutate(past_cases_by_pop = c(NA, ac_visits_cases_select$cases_by_pop[1:(nrow(ac_visits_cases_select)-1)]),
         case_growth_daily = change_cases_by_pop*100 / past_cases_by_pop) %>%
  filter(!is.na(case_growth_daily))

ac_visits_cases_select_growth %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))

This might look a little better.

ac_visits_cases_select_growth_1 <- ac_visits_cases_select_growth %>%
  filter(date < (max(date) - 7))

ac_visits_cases_select_growth_2 <- ac_visits_cases_select_growth %>%
  filter(date >= (max(date) - 7))

model_visits_cases_growth_1 <- lm(case_growth_daily ~ visits_per_capita, ac_visits_cases_select_growth_1)
summary(model_visits_cases_growth_1)
## 
## Call:
## lm(formula = case_growth_daily ~ visits_per_capita, data = ac_visits_cases_select_growth_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1727 -3.6561 -0.7812  2.1856 20.5504 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         -3.549      4.737  -0.749    0.457
## visits_per_capita   11.867      7.106   1.670    0.101
## 
## Residual standard error: 4.904 on 55 degrees of freedom
## Multiple R-squared:  0.04826,    Adjusted R-squared:  0.03096 
## F-statistic: 2.789 on 1 and 55 DF,  p-value: 0.1006
plot_ly() %>%
  add_trace(data = ac_visits_cases_select_growth_1, x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_select_growth_1, x = ~visits_per_capita, y = fitted(model_visits_cases_growth_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_select_growth_2, x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))

Removing the zero growth rate values.

model_visits_cases_growth_1_no_zero <- lm(case_growth_daily ~ visits_per_capita, ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0))
summary(model_visits_cases_growth_1_no_zero)
## 
## Call:
## lm(formula = case_growth_daily ~ visits_per_capita, data = ac_visits_cases_select_growth_1 %>% 
##     filter(case_growth_daily > 0))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3577 -3.0053 -0.7427  0.4916 18.1492 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         -1.448      5.841  -0.248    0.806
## visits_per_capita   12.313      8.656   1.423    0.164
## 
## Residual standard error: 4.648 on 34 degrees of freedom
## Multiple R-squared:  0.05618,    Adjusted R-squared:  0.02842 
## F-statistic: 2.024 on 1 and 34 DF,  p-value: 0.164
plot_ly() %>%
  add_trace(data = ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = fitted(model_visits_cases_growth_1_no_zero), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1_no_zero)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_select_growth_2 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))

That does look at least in the direction we’d expect, but is not significant still.

Try with one more different zip code. (94603)

zip_select <- "94603"
lag <- 14

ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select) %>%
  mutate(change_cases_by_pop = c(NA, diff(cases_by_pop)))

ac_daily_visits_zip_select <- ac_daily_visits_zip %>% filter(zipcode == zip_select)

ac_daily_visits_lag <- ac_daily_visits_zip_select %>% 
  ungroup() %>%
  dplyr::select(date, total_visits_avg) %>% 
  mutate(lag_date = date + lag) %>%
  dplyr::select(lag_date, total_visits_avg)

ac_visits_cases_select <- left_join(ac_cases_zip_select, ac_daily_visits_lag, by = c("date" = "lag_date")) %>%
  filter(cases >= 10) %>% # only select once cases have started to go above the threshold
  mutate(visits_per_capita = total_visits_avg / total_pop_zip)

ac_visits_cases_select %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))

Separating this out into all but the most recent week of data, then adding that in.

ac_visits_cases_select_1 <- ac_visits_cases_select %>%
  filter(date < (max(date) - 7))

ac_visits_cases_select_2 <- ac_visits_cases_select %>%
  filter(date >= (max(date) - 7))

model_visits_cases_1 <- lm(change_cases_by_pop ~ visits_per_capita, ac_visits_cases_select_1)
summary(model_visits_cases_1)
## 
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_visits_cases_select_1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -8.824e-05 -4.538e-05 -9.027e-06  3.894e-05  1.880e-04 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        1.552e-04  6.730e-05   2.307   0.0249 *
## visits_per_capita -1.158e-04  9.786e-05  -1.183   0.2419  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.136e-05 on 55 degrees of freedom
## Multiple R-squared:  0.02481,    Adjusted R-squared:  0.007084 
## F-statistic:   1.4 on 1 and 55 DF,  p-value: 0.2419
plot_ly() %>%
  add_trace(data = ac_visits_cases_select_1, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_select_1, x = ~visits_per_capita, y = fitted(model_visits_cases_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_1)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_select_2, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))

Look at growth rate (change divided by previous day’s levels).

ac_visits_cases_select_growth <- ac_visits_cases_select %>%
  mutate(past_cases_by_pop = c(NA, ac_visits_cases_select$cases_by_pop[1:(nrow(ac_visits_cases_select)-1)]),
         case_growth_daily = change_cases_by_pop*100 / past_cases_by_pop) %>%
  filter(!is.na(case_growth_daily))

ac_visits_cases_select_growth %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))

This might look a little better.

ac_visits_cases_select_growth_1 <- ac_visits_cases_select_growth %>%
  filter(date < (max(date) - 7))

ac_visits_cases_select_growth_2 <- ac_visits_cases_select_growth %>%
  filter(date >= (max(date) - 7))

model_visits_cases_growth_1 <- lm(case_growth_daily ~ visits_per_capita, ac_visits_cases_select_growth_1)
summary(model_visits_cases_growth_1)
## 
## Call:
## lm(formula = case_growth_daily ~ visits_per_capita, data = ac_visits_cases_select_growth_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0560 -3.3384 -0.9768  1.3702 17.0075 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         -5.724      5.767  -0.993   0.3253  
## visits_per_capita   15.864      8.362   1.897   0.0632 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.152 on 54 degrees of freedom
## Multiple R-squared:  0.06249,    Adjusted R-squared:  0.04513 
## F-statistic: 3.599 on 1 and 54 DF,  p-value: 0.06315
plot_ly() %>%
  add_trace(data = ac_visits_cases_select_growth_1, x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_select_growth_1, x = ~visits_per_capita, y = fitted(model_visits_cases_growth_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_select_growth_2, x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))

Removing the zero growth rate values.

model_visits_cases_growth_1_no_zero <- lm(case_growth_daily ~ visits_per_capita, ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0))
summary(model_visits_cases_growth_1_no_zero)
## 
## Call:
## lm(formula = case_growth_daily ~ visits_per_capita, data = ac_visits_cases_select_growth_1 %>% 
##     filter(case_growth_daily > 0))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3748 -3.1423 -0.9239  1.3312 16.3148 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         -8.379      5.837  -1.436    0.158  
## visits_per_capita   21.301      8.508   2.504    0.016 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.93 on 45 degrees of freedom
## Multiple R-squared:  0.1223, Adjusted R-squared:  0.1028 
## F-statistic: 6.268 on 1 and 45 DF,  p-value: 0.01599
plot_ly() %>%
  add_trace(data = ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = fitted(model_visits_cases_growth_1_no_zero), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1_no_zero)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_select_growth_2 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))

Repeating for all zip codes

lag <- 14

ac_cases_zip <- ac_cases_zip %>%
  group_by(zipcode) %>%
  mutate(change_cases_by_pop = c(NA, diff(cases_by_pop)))

ac_daily_visits_lag <- ac_daily_visits_zip %>% 
  mutate(lag_date = date + lag) %>%
  dplyr::select(-date) %>%
  rename(date = lag_date)

ac_visits_cases <- left_join(ac_cases_zip, ac_daily_visits_lag) %>%
  filter(cases >= 10) %>% # only select once cases have started to go above the threshold
  mutate(visits_per_capita = total_visits_avg / total_pop_zip) %>%
  group_by(zipcode) %>%
  mutate(visits_mov = movavg(visits_per_capita, 5, type = "s"), 
         change_cases_by_pop_mov = movavg(change_cases_by_pop, 5, type = "s")) %>% 
  mutate(case_growth = change_cases_by_pop / lag(cases_by_pop)) %>%
  mutate(growth_mov = movavg(case_growth, 5, type = "s")) %>%
  filter(!is.na(cases_by_pop) & !is.na(visits_per_capita) & !is.na(case_growth))

ac_visits_cases %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))
ac_visits_cases %>% plot_ly() %>%
  add_trace(x = ~visits_mov, y = ~change_cases_by_pop_mov, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving average'), yaxis = list(title = 'Daily new cases per person, 5 day moving average'))
ac_visits_cases %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~case_growth, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily case growth rate'))
ac_visits_cases %>% plot_ly() %>%
  add_trace(x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers') %>%
  layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving average'), yaxis = list(title = 'Daily case growth rate, 5 day moving average'))

Try separating out by date.

ac_visits_cases_growth_1 <- ac_visits_cases %>%
  filter(date < (max(date) - 7))

ac_visits_cases_growth_2 <- ac_visits_cases %>%
  filter(date >= (max(date) - 7))

model_visits_cases_1 <- lm(change_cases_by_pop ~ visits_per_capita, ac_visits_cases_growth_1)
summary(model_visits_cases_1)
## 
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_visits_cases_growth_1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.169e-05 -3.061e-05 -1.951e-05  1.170e-05  6.748e-04 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.581e-06  5.304e-06   0.675      0.5    
## visits_per_capita 4.186e-05  7.907e-06   5.294 1.32e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.186e-05 on 2108 degrees of freedom
## Multiple R-squared:  0.01312,    Adjusted R-squared:  0.01265 
## F-statistic: 28.03 on 1 and 2108 DF,  p-value: 1.322e-07
model_visits_cases_1_mov <- lm(change_cases_by_pop_mov ~ visits_mov, ac_visits_cases_growth_1 %>% filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)))
summary(model_visits_cases_1_mov)
## 
## Call:
## lm(formula = change_cases_by_pop_mov ~ visits_mov, data = ac_visits_cases_growth_1 %>% 
##     filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.962e-05 -2.477e-05 -1.238e-05  1.201e-05  3.258e-04 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.254e-06  4.512e-06   1.829   0.0675 .  
## visits_mov  3.854e-05  6.698e-06   5.754    1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.975e-05 on 2104 degrees of freedom
## Multiple R-squared:  0.01549,    Adjusted R-squared:  0.01502 
## F-statistic: 33.11 on 1 and 2104 DF,  p-value: 1.001e-08
plot_ly() %>%
  add_trace(data = ac_visits_cases_growth_1 %>% filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~change_cases_by_pop_mov, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_growth_1 %>% filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)), x = ~visits_mov, y = fitted(model_visits_cases_1_mov), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_1_mov)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_growth_2 %>% filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~change_cases_by_pop_mov, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving avg'), yaxis = list(title = 'Daily change in cases per person, 5 day moving avg'))
model_visits_cases_growth_1 <- lm(case_growth ~ visits_per_capita, ac_visits_cases_growth_1)
summary(model_visits_cases_growth_1)
## 
## Call:
## lm(formula = case_growth ~ visits_per_capita, data = ac_visits_cases_growth_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.09615 -0.03170 -0.01920  0.01414  0.52340 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.011050   0.005615  -1.968   0.0492 *  
## visits_per_capita  0.065891   0.008371   7.871 5.56e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0549 on 2108 degrees of freedom
## Multiple R-squared:  0.02855,    Adjusted R-squared:  0.02809 
## F-statistic: 61.96 on 1 and 2108 DF,  p-value: 5.556e-15
model_visits_cases_growth_1_mov <- lm(growth_mov ~ visits_mov, ac_visits_cases_growth_1 %>% filter(!is.na(growth_mov) & !is.na(visits_mov)))
summary(model_visits_cases_growth_1_mov)
## 
## Call:
## lm(formula = growth_mov ~ visits_mov, data = ac_visits_cases_growth_1 %>% 
##     filter(!is.na(growth_mov) & !is.na(visits_mov)))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.039150 -0.020670 -0.009317  0.009959  0.185408 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.002746   0.004551  -0.603    0.546    
## visits_mov   0.051473   0.006851   7.513 8.75e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03266 on 1950 degrees of freedom
## Multiple R-squared:  0.02813,    Adjusted R-squared:  0.02763 
## F-statistic: 56.45 on 1 and 1950 DF,  p-value: 8.748e-14
plot_ly() %>%
  add_trace(data = ac_visits_cases_growth_1 %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_growth_1 %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = fitted(model_visits_cases_growth_1_mov), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1_mov)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_growth_2 %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving avg'), yaxis = list(title = 'Daily growth rate of cases per person, 5 day moving avg'))

Removing zero values.

ac_visits_cases_growth_1_no_zero <- ac_visits_cases %>%
  filter(date < (max(date) - 7)) %>%
  filter(case_growth > 0)

ac_visits_cases_growth_2_no_zero <- ac_visits_cases %>%
  filter(date >= (max(date) - 7)) %>%
  filter(case_growth > 0)

model_visits_cases_growth_1_no_zero <- lm(case_growth ~ visits_per_capita, ac_visits_cases_growth_1_no_zero)
summary(model_visits_cases_growth_1_no_zero)
## 
## Call:
## lm(formula = case_growth ~ visits_per_capita, data = ac_visits_cases_growth_1_no_zero)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06394 -0.03786 -0.01594  0.01551  0.49307 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.037588   0.009752   3.855 0.000123 ***
## visits_per_capita 0.037966   0.013937   2.724 0.006552 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06412 on 1064 degrees of freedom
## Multiple R-squared:  0.006926,   Adjusted R-squared:  0.005993 
## F-statistic: 7.421 on 1 and 1064 DF,  p-value: 0.006552
model_visits_cases_growth_1_mov_no_zero <- lm(growth_mov ~ visits_mov, ac_visits_cases_growth_1_no_zero %>% filter(!is.na(growth_mov) & !is.na(visits_mov)))
summary(model_visits_cases_growth_1_mov_no_zero)
## 
## Call:
## lm(formula = growth_mov ~ visits_mov, data = ac_visits_cases_growth_1_no_zero %>% 
##     filter(!is.na(growth_mov) & !is.na(visits_mov)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04488 -0.02412 -0.01060  0.01222  0.17525 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 0.017395   0.008185   2.125  0.03382 * 
## visits_mov  0.038586   0.011860   3.253  0.00118 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03706 on 988 degrees of freedom
## Multiple R-squared:  0.0106, Adjusted R-squared:  0.009598 
## F-statistic: 10.58 on 1 and 988 DF,  p-value: 0.001179
plot_ly() %>%
  add_trace(data = ac_visits_cases_growth_1_no_zero %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers', name = "past data") %>%
  add_trace(data = ac_visits_cases_growth_1_no_zero %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = fitted(model_visits_cases_growth_1_mov_no_zero), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1_mov_no_zero)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_growth_2_no_zero %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers', name = "most recent week") %>% 
  layout(title = 'Alameda County by zip code, zeros removed', xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving avg'), yaxis = list(title = 'Daily growth rate of cases per person, 5 day moving avg'))

Summarizing into weekly counts

start_date_weeks <- as.Date("2020-03-28")
ac_visits_cases_weekly <- ac_visits_cases %>%
  mutate(week = floor((date - start_date_weeks) / 7) + 1) %>%
  group_by(zipcode, week) %>%
  summarize(final_cases_by_pop_week = max(cases_by_pop),
            total_visits_per_capita_week = sum(visits_per_capita)) %>%
  group_by(zipcode) %>%
  mutate(change_cases_by_pop_week = c(NA, diff(final_cases_by_pop_week)), 
         case_growth_week = change_cases_by_pop_week / lag(final_cases_by_pop_week), 
         log_cases_by_pop_week = log(final_cases_by_pop_week),
         change_log_cases_by_pop_week = c(NA, diff(log_cases_by_pop_week))) %>% 
  filter(!is.na(case_growth_week)) %>%
  filter(week < max(week)) # remove last week since it wasn't a full week
  
ac_visits_cases_weekly %>% plot_ly() %>%
  add_trace(x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', color = ~zipcode, text = ~week) %>%
  layout(title = "Alameda County", xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly change in cases per person'))
ac_visits_cases_weekly %>% plot_ly() %>%
  add_trace(x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', color = ~zipcode, text = ~week) %>%
  layout(title = "Alameda County", xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly growth rate of cases per person'))

Look at the week 10 (latest full week) data when we fit a trendline to all other week data.

ac_visits_cases_weekly_1 <- ac_visits_cases_weekly %>% 
  filter(week < max(week))

ac_visits_cases_weekly_2 <- ac_visits_cases_weekly %>% 
  filter(week == max(week))

model_week_1_abs <- lm(change_cases_by_pop_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1)
summary(model_week_1_abs)
## 
## Call:
## lm(formula = change_cases_by_pop_week ~ total_visits_per_capita_week, 
##     data = ac_visits_cases_weekly_1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.741e-04 -1.502e-04 -6.302e-05  6.679e-05  1.820e-03 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                  -9.584e-05  1.071e-04  -0.895  0.37163   
## total_visits_per_capita_week  6.869e-05  2.317e-05   2.964  0.00331 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002618 on 265 degrees of freedom
## Multiple R-squared:  0.03209,    Adjusted R-squared:  0.02844 
## F-statistic: 8.787 on 1 and 265 DF,  p-value: 0.00331
plot_ly() %>%
  add_trace(data = ac_visits_cases_weekly_1, x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "past data", text = ~week) %>%
  add_trace(data = ac_visits_cases_weekly_1 , x = ~total_visits_per_capita_week, y = fitted(model_week_1_abs), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1_abs)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_weekly_2, x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>% 
  layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly change in cases per person'))
model_week_1 <- lm(case_growth_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1)
summary(model_week_1)
## 
## Call:
## lm(formula = case_growth_week ~ total_visits_per_capita_week, 
##     data = ac_visits_cases_weekly_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28221 -0.14975 -0.07356  0.07097  1.47371 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                  -0.06530    0.10674  -0.612  0.54124   
## total_visits_per_capita_week  0.06452    0.02310   2.794  0.00559 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.261 on 265 degrees of freedom
## Multiple R-squared:  0.02861,    Adjusted R-squared:  0.02494 
## F-statistic: 7.804 on 1 and 265 DF,  p-value: 0.005594
plot_ly() %>%
  add_trace(data = ac_visits_cases_weekly_1, x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', name = "past data", text = ~week) %>%
  add_trace(data = ac_visits_cases_weekly_1 , x = ~total_visits_per_capita_week, y = fitted(model_week_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_weekly_2, x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>% 
  layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly growth rate of cases per person'))

Remove zeros.

model_week_1_no_zeros_abs <- lm(change_cases_by_pop_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1 %>% filter(change_cases_by_pop_week > 0))
summary(model_week_1_no_zeros_abs)
## 
## Call:
## lm(formula = change_cases_by_pop_week ~ total_visits_per_capita_week, 
##     data = ac_visits_cases_weekly_1 %>% filter(change_cases_by_pop_week > 
##         0))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.652e-04 -1.547e-04 -7.274e-05  8.047e-05  1.796e-03 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                  -3.756e-05  1.193e-04  -0.315   0.7532  
## total_visits_per_capita_week  5.949e-05  2.563e-05   2.321   0.0211 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002663 on 244 degrees of freedom
## Multiple R-squared:  0.02161,    Adjusted R-squared:  0.0176 
## F-statistic: 5.389 on 1 and 244 DF,  p-value: 0.02109
plot_ly() %>%
  add_trace(data = ac_visits_cases_weekly_1 %>% filter(change_cases_by_pop_week > 0), x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "past weeks", text = ~week) %>%
  add_trace(data = ac_visits_cases_weekly_1 %>% filter(change_cases_by_pop_week > 0), x = ~total_visits_per_capita_week, y = fitted(model_week_1_no_zeros_abs), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1_no_zeros_abs)$r.squared), name = "past week fit") %>% 
  add_trace(data = ac_visits_cases_weekly_2 %>% filter(change_cases_by_pop_week > 0), x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>% 
  layout(title = 'Alameda County by zip code, zeros removed', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly change in cases per person'))
model_week_1_no_zeros <- lm(case_growth_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1 %>% filter(case_growth_week > 0))
summary(model_week_1_no_zeros)
## 
## Call:
## lm(formula = case_growth_week ~ total_visits_per_capita_week, 
##     data = ac_visits_cases_weekly_1 %>% filter(case_growth_week > 
##         0))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27649 -0.15046 -0.08185  0.06625  1.46041 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                   0.00291    0.11853   0.025   0.9804  
## total_visits_per_capita_week  0.05341    0.02545   2.099   0.0369 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2645 on 244 degrees of freedom
## Multiple R-squared:  0.01773,    Adjusted R-squared:  0.0137 
## F-statistic: 4.404 on 1 and 244 DF,  p-value: 0.03688
plot_ly() %>%
  add_trace(data = ac_visits_cases_weekly_1 %>% filter(case_growth_week > 0), x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', name = "past weeks", text = ~week) %>%
  add_trace(data = ac_visits_cases_weekly_1 %>% filter(case_growth_week > 0), x = ~total_visits_per_capita_week, y = fitted(model_week_1_no_zeros), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1_no_zeros)$r.squared), name = "past week fit") %>% 
  add_trace(data = ac_visits_cases_weekly_2 %>% filter(case_growth_week > 0), x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>% 
  layout(title = 'Alameda County by zip code, zeros removed', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly growth rate of cases per person'))

Try with change in log of cases.

model_week_1_log <- lm(change_log_cases_by_pop_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1)
summary(model_week_1_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop_week ~ total_visits_per_capita_week, 
##     data = ac_visits_cases_weekly_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22661 -0.11383 -0.04483  0.06932  0.79698 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                  -0.01958    0.07059  -0.277  0.78173   
## total_visits_per_capita_week  0.04571    0.01527   2.993  0.00303 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1726 on 265 degrees of freedom
## Multiple R-squared:  0.03269,    Adjusted R-squared:  0.02904 
## F-statistic: 8.957 on 1 and 265 DF,  p-value: 0.003025
plot_ly() %>%
  add_trace(data = ac_visits_cases_weekly_1, x = ~total_visits_per_capita_week, y = ~change_log_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "past data", text = ~week) %>%
  add_trace(data = ac_visits_cases_weekly_1 , x = ~total_visits_per_capita_week, y = fitted(model_week_1_log), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1_log)$r.squared), name = "past data fit") %>% 
  add_trace(data = ac_visits_cases_weekly_2, x = ~total_visits_per_capita_week, y = ~change_log_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>% 
  layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly change in log(cases per person)'))